Code
import pandas as pd
eda = pd.read_parquet("data/eda.parquet")# identifying data analyst jobs by keyword searching
keywords = ['Data Analyst', 'Business Analyst', 'Data Engineering', 'Deep Learning',
'Data Science', 'Data Analysis','Data Analytics', 'Market Research Analyst'
'LLM', 'Language Model', 'NLP', 'Natural Language Processing',
'Computer Vision', 'Business Intelligence Analyst', 'Quantitative Analyst', 'Operations Analyst']
match = lambda col: eda[col].str.contains('|'.join(keywords), case=False, na=False)
eda['DATA_ANALYST_JOB'] = match('TITLE_NAME') \
| match('SKILLS_NAME') \
| match('SPECIALIZED_SKILLS_NAME')
eda['DATA_ANALYST_JOB'].value_counts()DATA_ANALYST_JOB
False 38212
True 33042
Name: count, dtype: int64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
df_grouped = (
eda
.groupby(['DATA_ANALYST_JOB','NAICS2_NAME'])
.size()
.reset_index(name='Job_Count')
)
short_names = {
'Professional, Scientific, and Technical Services': 'Tech. Services',
'Administrative and Support and Waste Management and Remediation Services': 'Admin & Waste Mgmt',
'Health Care and Social Assistance': 'Healthcare',
'Finance and Insurance': 'Finance',
'Information': 'Info Tech',
'Educational Services': 'Education',
'Manufacturing': 'Manufacturing',
'Retail Trade': 'Retail',
'Accommodation and Food Services': 'Hospitality',
'Other Services (except Public Administration)': 'Other Services'
}
df_grouped['Industry'] = df_grouped['NAICS2_NAME'].map(short_names).fillna(df_grouped['NAICS2_NAME'])
df_grouped['Job_Type'] = df_grouped['DATA_ANALYST_JOB'].map({True:'True', False:'False'})
pivot = (
df_grouped
.pivot_table(index='Industry', columns='Job_Type', values='Job_Count', fill_value=0)
.reset_index()
)
industries = pivot['Industry'].tolist()
y_true = pivot['True'].tolist()
y_false = pivot['False'].tolist()
# 2) Build a 2-row subplot: bar on top, table below
fig = make_subplots(
rows=2, cols=1,
row_heights=[0.70, 0.30], # give a bit more room to the table
specs=[[{"type":"bar"}],[{"type":"table"}]],
vertical_spacing=0.12 # more space between bar and table
)
colors = {'True': '#FFE5E5', 'False': '#FF6B6B'}
fig.add_trace(
go.Bar(
x=industries, y=y_true, name='True',
marker=dict(color=colors['True'], line=dict(color='#A81D1D', width=1)),
text=y_true, textposition='outside'
),
row=1, col=1
)
fig.add_trace(
go.Bar(
x=industries, y=y_false, name='False',
marker=dict(color=colors['False'], line=dict(color='#A81D1D', width=1)),
text=y_false, textposition='outside'
),
row=1, col=1
)
# 3) Slider steps: 0 → 8 000 in 200s
steps = []
for val in range(0, 8001, 200):
steps.append(dict(
label=str(val),
method="update",
args=[
{"y": [
[v if v>=val else 0 for v in y_true],
[v if v>=val else 0 for v in y_false]
]},
{"title": f"Min Jobs ≥ {val:,}"}
]
))
# 4) Final layout tweaks
fig.update_layout(
# lift slider above everything
sliders=[dict(
active=0,
currentvalue={"prefix":"Min Jobs: "},
pad={"b":0},
x=0.05,
y=1.05, # move slider way above the plot area
xanchor="left",
yanchor="bottom",
len=0.7,
font=dict(color='#A81D1D'),
steps=steps
)],
title=dict(
text="Data & Business Analytics Job Trends",
font=dict(size=24, color='#A81D1D'),
x=0.5,
y=0.95, # drop the title just below the slider
xanchor="center",
yanchor="top"
),
width=1100, height=850,
margin=dict(l=60, r=60, t=180, b=200), # extra top & bottom margin
plot_bgcolor='white',
paper_bgcolor='white',
xaxis=dict(
title="Industry",
title_font=dict(size=16, color='#A81D1D'),
tickmode='array',
tickvals=list(range(len(industries))),
ticktext=industries,
tickangle=-30,
tickfont=dict(size=11, color='#333'),
showline=True, linecolor='#A81D1D'
),
yaxis=dict(
title="Number of Jobs",
title_font=dict(size=16, color='#A81D1D'),
tickfont=dict(size=11, color='#333'),
gridcolor='rgba(200,200,200,0.3)',
showline=True, linecolor='#A81D1D',
range=[0, max(max(y_true),max(y_false))*1.2]
),
legend=dict(
title="Data Analyst Job",
title_font=dict(color='#A81D1D'),
font=dict(size=12),
x=0.95, y=0.95
),
bargap=0.2
)
fig.write_html(
"figures/edaplot1.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 2: Clean the data (remove rows with missing SPECIALIZED_SKILLS_NAME)
df = df.dropna(subset=['SPECIALIZED_SKILLS_NAME'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Split the SPECIALIZED_SKILLS_NAME into individual skills
# Assuming SPECIALIZED_SKILLS_NAME is a string of skills separated by commas or another delimiter
df_skills = df.copy()
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.split(',') # Adjust delimiter if needed
df_skills = df_skills.explode('SPECIALIZED_SKILLS_NAME')
df_skills['SPECIALIZED_SKILLS_NAME'] = df_skills['SPECIALIZED_SKILLS_NAME'].str.strip()
# Step 4: Group by skill and Job_Category to get the count
df_skills_count = df_skills.groupby(['SPECIALIZED_SKILLS_NAME', 'Job_Category']).size().reset_index(name='Count')
# Step 5: Get the top 10 skills by total count
top_skills = df_skills_count.groupby('SPECIALIZED_SKILLS_NAME')['Count'].sum().nlargest(10).index
df_skills_top = df_skills_count[df_skills_count['SPECIALIZED_SKILLS_NAME'].isin(top_skills)]
# Debug: Check the grouped data
print("Top 10 specialized skills:")
print(df_skills_top)
# Step 6: Create the bar plot
fig = px.bar(
df_skills_top,
x='Count',
y='SPECIALIZED_SKILLS_NAME',
color='Job_Category',
barmode='stack',
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
title='Top 10 Specialized Skills by Job Category'
)
# Step 7: Update layout for styling
fig.update_layout(
width=900,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
xaxis=dict(
title='Number of Jobs',
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor='#E2E8F0',
linecolor='#2D3748',
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title='Specialized Skill',
title_font=dict(size=16),
tickfont=dict(size=12)
),
legend=dict(
title='Job Category',
font=dict(size=13),
bgcolor='#FFFFFF',
bordercolor='#FF6B6B',
borderwidth=1,
x=1.02,
y=0.5,
xanchor='left',
yanchor='middle'
)
)
# Save to HTML
fig.write_html(
'figures/edaplot2.html',
include_plotlyjs='cdn',
full_html=False
)Number of rows after cleaning: 71254
Top 10 specialized skills:
SPECIALIZED_SKILLS_NAME Job_Category Count
2071 "Business Intelligence" Analytics Job 8077
2072 "Business Intelligence" Non-Analytics Job 1778
2167 "Business Process" Analytics Job 4417
2168 "Business Process" Non-Analytics Job 8385
2180 "Business Requirements" Analytics Job 4972
2181 "Business Requirements" Non-Analytics Job 7740
4151 "Dashboard" Analytics Job 9975
4152 "Dashboard" Non-Analytics Job 1559
4173 "Data Analysis" Analytics Job 25620
6640 "Finance" Analytics Job 5573
6641 "Finance" Non-Analytics Job 6220
13332 "Project Management" Analytics Job 6236
13333 "Project Management" Non-Analytics Job 6939
13594 "Python (Programming Language)" Analytics Job 10127
13595 "Python (Programming Language)" Non-Analytics Job 1703
14459 "SAP Applications" Analytics Job 2174
14460 "SAP Applications" Non-Analytics Job 9149
14686 "SQL (Programming Language)" Analytics Job 16067
14687 "SQL (Programming Language)" Non-Analytics Job 4202
import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Step 2: Clean the data (remove rows with missing values for the variables we need)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'REMOTE_TYPE_NAME', 'MIN_EDULEVELS_NAME'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 4: Encode categorical variables numerically for the parallel coordinates plot
# Encode REMOTE_TYPE_NAME (e.g., Remote=2, Hybrid=1, On-site=0)
remote_mapping = {'Remote': 2, 'Hybrid': 1, 'On-site': 0} # Adjust based on your unique values
df['Remote_Type_Code'] = df['REMOTE_TYPE_NAME'].map(remote_mapping)
# Encode MIN_EDULEVELS_NAME (e.g., High School=0, Bachelor's=1, Master's=2, PhD=3)
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3} # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)
# Step 5: Clean again after encoding (in case some mappings failed)
df = df.dropna(subset=['Remote_Type_Code', 'Edu_Level_Code'])
# Debug: Check the encoded data
print("Sample encoded data:")
print(df[['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code', 'Job_Category']].head())
# Step 6: Create the parallel coordinates plot
fig = px.parallel_coordinates(
df,
dimensions=['SALARY', 'Avg_Years_Experience', 'Remote_Type_Code', 'Edu_Level_Code'],
color_continuous_scale=['#4ECDC4', '#FF6B6B'], # Gradient from teal to red
color=df['Job_Category'].map({'Analytics Job': 1, 'Non-Analytics Job': 0}), # Color by Job_Category
labels={
'SALARY': 'Salary ($)',
'Avg_Years_Experience': 'Avg. Years Experience',
'Remote_Type_Code': 'Remote Type (0=On-site, 1=Hybrid, 2=Remote)',
'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
},
title='Multidimensional Comparison of Job Attributes by Job Category'
)
# Step 7: Update layout for styling
fig.update_layout(
width=1000,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
)
)
fig.write_html(
"figures/edaplot3.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)Number of rows after cleaning: 71254
Sample encoded data:
Empty DataFrame
Columns: [SALARY, Avg_Years_Experience, Remote_Type_Code, Edu_Level_Code, Job_Category]
Index: []
import plotly.express as px
import pandas as pd
# Prepare the data
df = eda.copy()
# Define analytics jobs (Data Analyst + Business Analyst)
def classify_analytics_job(row):
if row['DATA_ANALYST_JOB']:
return True
title = str(row['TITLE_NAME']).lower() if 'TITLE_NAME' in row else str(row['TITLE']).lower()
if 'business analyst' in title:
return True
return False
df['IS_ANALYTICS_JOB'] = df.apply(classify_analytics_job, axis=1)
df['Job_Category'] = df['IS_ANALYTICS_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Clean the data (remove rows with missing salary or experience)
df = df.dropna(subset=['Avg_Years_Experience', 'SALARY'])
# Create the scatter plot with trend line
fig = px.scatter(df,
x='Avg_Years_Experience',
y='SALARY',
color='Job_Category',
trendline='ols', # Add trend line (ordinary least squares)
title='Experience Requirements vs Salary for Analytics Jobs',
labels={'Avg_Years_Experience': 'Average Years of Experience', 'SALARY': 'Salary ($)', 'Job_Category': 'Job Category'},
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'})
# Beautify the layout with a red-white theme (no gradients)
fig.update_layout(
width=900,
height=600,
plot_bgcolor='#FFFFFF', # Plain white background
paper_bgcolor='#FFFFFF', # Plain white background
font=dict(family="Inter, sans-serif", size=14, color="#2D3748"),
title=dict(
font=dict(size=24, color="#FF6B6B"), # Red title for theme
x=0.5,
xanchor="center",
y=0.95,
yanchor="top"
),
xaxis=dict(
title="Average Years of Experience",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
yaxis=dict(
title="Salary ($)",
title_font=dict(size=16),
tickfont=dict(size=12),
gridcolor="#E2E8F0",
linecolor="#2D3748",
linewidth=2,
showline=True,
showgrid=True,
zeroline=False
),
legend=dict(
title="Job Category",
font=dict(size=13),
bgcolor="#FFFFFF",
bordercolor="#FF6B6B", # Red border for theme
borderwidth=1,
x=1.02,
y=0.5,
xanchor="left",
yanchor="middle"
),
hovermode="closest",
hoverlabel=dict(
bgcolor="#FFFFFF",
font_size=12,
font_family="Inter, sans-serif",
font_color="#2D3748",
bordercolor="#FF6B6B" # Red border for hover
)
)
# Customize scatter points
fig.update_traces(
marker=dict(
size=8,
opacity=0.7,
line=dict(width=1, color="#2D3748")
)
)
fig.write_html(
"figures/edaplot4.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)import plotly.express as px
import pandas as pd
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Calculate average years of experience
df['Avg_Years_Experience'] = (df['MIN_YEARS_EXPERIENCE'] + df['MAX_YEARS_EXPERIENCE']) / 2
# Step 2: Clean the data (remove rows with missing values)
df = df.dropna(subset=['SALARY', 'Avg_Years_Experience', 'MIN_EDULEVELS_NAME', 'POSTED'])
# Debug: Check the number of rows after cleaning
print("Number of rows after cleaning:", len(df))
# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 4: Encode MIN_EDULEVELS_NAME numerically
edu_mapping = {'High School': 0, 'Bachelor\'s': 1, 'Master\'s': 2, 'PhD': 3} # Adjust based on your unique values
df['Edu_Level_Code'] = df['MIN_EDULEVELS_NAME'].map(edu_mapping)
# Step 5: Clean again after encoding
df = df.dropna(subset=['Edu_Level_Code'])
# Step 6: Extract a time component from POSTED (e.g., year or year-month)
df['POSTED'] = pd.to_datetime(df['POSTED']) # Ensure POSTED is in datetime format
df['Year_Month'] = df['POSTED'].dt.to_period('M').astype(str) # Extract year-month for animation
# Debug: Check the time range
print("Time range for animation:", df['Year_Month'].unique())
# Step 7: Create the 3D scatter plot with animation
fig = px.scatter_3d(
df,
x='Avg_Years_Experience',
y='SALARY',
z='Edu_Level_Code',
color='Job_Category',
color_discrete_map={'Analytics Job': '#FF6B6B', 'Non-Analytics Job': '#4ECDC4'},
animation_frame='Year_Month', # Animate over time
title='Salary, Experience, and Education Over Time by Job Category',
labels={
'Avg_Years_Experience': 'Avg. Years Experience',
'SALARY': 'Salary ($)',
'Edu_Level_Code': 'Min. Education Level (0=HS, 1=Bachelor\'s, 2=Master\'s, 3=PhD)'
},
hover_data=['Job_Category', 'Year_Month']
)
# Step 8: Update layout for styling
fig.update_layout(
width=900,
height=600,
scene=dict(
xaxis_title='Avg. Years Experience',
yaxis_title='Salary ($)',
zaxis_title='Min. Education Level',
xaxis=dict(tickfont=dict(size=12)),
yaxis=dict(tickfont=dict(size=12)),
zaxis=dict(tickfont=dict(size=12))
),
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
legend=dict(
title='Job Category',
font=dict(size=13),
bgcolor='#FFFFFF',
bordercolor='#FF6B6B',
borderwidth=1,
x=1.02,
y=0.5,
xanchor='left',
yanchor='middle'
)
)
# Save the file
fig.write_html(
"figures/edaplot5.html",
include_plotlyjs="cdn", # Use CDN to load Plotly JS
full_html=False # Only include the plot div
)Number of rows after cleaning: 71254
Time range for animation: []
import plotly.express as px
import pandas as pd
import numpy as np
# Assuming eda is already loaded and DATA_ANALYST_JOB is defined
df = eda.copy()
# Step 1: Clean the data (remove rows with missing STATE_NAME)
df = df.dropna(subset=['STATE_NAME'])
# Step 2: Handle any anomalies in STATE_NAME (e.g., "[Unknown City], AR" should not affect STATE_NAME)
# Since STATE_NAME is already provided (e.g., "Arkansas"), we can proceed directly
# Debug: Check the number of rows after cleaning and unique states
print("Number of rows after cleaning:", len(df))
print("Unique states extracted:", df['STATE_NAME'].unique())
# Step 3: Map DATA_ANALYST_JOB to labels
df['Job_Category'] = df['DATA_ANALYST_JOB'].map({True: 'Analytics Job', False: 'Non-Analytics Job'})
# Step 4: Aggregate data by state and job category
df_state_counts = df.groupby(['STATE_NAME', 'Job_Category']).size().reset_index(name='Job_Count')
# Step 5: Pivot the data to get counts for Analytics and Non-Analytics Jobs
df_pivot = df_state_counts.pivot(index='STATE_NAME', columns='Job_Category', values='Job_Count').fillna(0)
df_pivot['Total_Jobs'] = df_pivot.get('Analytics Job', 0) + df_pivot.get('Non-Analytics Job', 0)
df_pivot = df_pivot.reset_index()
# Debug: Check the aggregated data
print("Aggregated data by state:")
print(df_pivot)
# Step 6: Map state names to state codes for Plotly choropleth
state_name_to_code = {
'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}
df_pivot['State_Code'] = df_pivot['STATE_NAME'].map(state_name_to_code)
# Step 7: Clean the data (remove rows with unmapped states)
df_pivot = df_pivot.dropna(subset=['State_Code'])
# Step 8: Apply a logarithmic scale to Total_Jobs for better visualization (avoiding zero values)
df_pivot['Log_Total_Jobs'] = np.log1p(df_pivot['Total_Jobs']) # log1p handles zero values (log(1+x))
# Debug: Check the final data before plotting
print("Final data for plotting:")
print(df_pivot[['STATE_NAME', 'State_Code', 'Total_Jobs', 'Log_Total_Jobs']])
# Step 9: Create the choropleth map with a logarithmic color scale
fig = px.choropleth(
df_pivot,
locations='State_Code',
locationmode='USA-states',
color='Log_Total_Jobs',
color_continuous_scale='Reds',
scope='usa',
title='Distribution of Jobs Across U.S. States (Log Scale)',
hover_data=['STATE_NAME', 'Analytics Job', 'Non-Analytics Job', 'Total_Jobs'],
labels={'Log_Total_Jobs': 'Log(Number of Jobs)', 'Total_Jobs': 'Number of Jobs'}
)
# Step 10: Update layout for styling
fig.update_layout(
width=900,
height=600,
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(family='Inter, sans-serif', size=14, color='#2D3748'),
title=dict(
font=dict(size=24, color='#FF6B6B'),
x=0.5,
xanchor='center',
y=0.95,
yanchor='top'
),
geo=dict(
bgcolor='white',
lakecolor='white',
landcolor='lightgray',
subunitcolor='black',
showlakes=True,
showsubunits=True
),
coloraxis_colorbar=dict(
title='Log(Number of Jobs)',
title_font=dict(size=13),
tickfont=dict(size=12),
tickvals=[np.log1p(x) for x in [0, 10, 50, 100, 500, 1000]],
ticktext=['0', '10', '50', '100', '500', '1000']
)
)
# Step 11: Save to HTML
fig.write_html(
'figures/edaplot6.html',
include_plotlyjs='cdn',
full_html=False
)Number of rows after cleaning: 71254
Unique states extracted: ['Arkansas' 'Maine' 'Texas' 'Arizona' 'California' 'Ohio' 'New Jersey'
'New York' 'Hawaii' 'Georgia' 'Michigan' 'Mississippi' 'Massachusetts'
'Alaska' 'Alabama' 'Indiana' 'Virginia' 'South Carolina' 'Colorado'
'Nevada' 'Minnesota' 'Oregon' 'Oklahoma' 'North Carolina' 'Florida'
'Washington' 'Delaware' 'Illinois' 'Pennsylvania' 'Kansas' 'Tennessee'
'Washington, D.C. (District of Columbia)' 'Maryland' 'Idaho' 'Louisiana'
'Connecticut' 'Nebraska' 'Missouri' 'North Dakota' 'Utah' 'New Hampshire'
'Wisconsin' 'Kentucky' 'Rhode Island' 'Iowa' 'South Dakota' 'Montana'
'New Mexico' 'Wyoming' 'West Virginia' 'Vermont']
Aggregated data by state:
Job_Category STATE_NAME Analytics Job \
0 Alabama 376
1 Alaska 133
2 Arizona 673
3 Arkansas 252
4 California 3239
5 Colorado 646
6 Connecticut 390
7 Delaware 219
8 Florida 1526
9 Georgia 1144
10 Hawaii 140
11 Idaho 214
12 Illinois 1595
13 Indiana 442
14 Iowa 285
15 Kansas 330
16 Kentucky 279
17 Louisiana 230
18 Maine 193
19 Maryland 791
20 Massachusetts 987
21 Michigan 709
22 Minnesota 709
23 Mississippi 267
24 Missouri 587
25 Montana 105
26 Nebraska 221
27 Nevada 237
28 New Hampshire 134
29 New Jersey 1022
30 New Mexico 163
31 New York 1752
32 North Carolina 1284
33 North Dakota 96
34 Ohio 1149
35 Oklahoma 280
36 Oregon 468
37 Pennsylvania 962
38 Rhode Island 253
39 South Carolina 333
40 South Dakota 149
41 Tennessee 599
42 Texas 2967
43 Utah 350
44 Vermont 118
45 Virginia 1906
46 Washington 856
47 Washington, D.C. (District of Columbia) 668
48 West Virginia 76
49 Wisconsin 477
50 Wyoming 61
Job_Category Non-Analytics Job Total_Jobs
0 282 658
1 101 234
2 932 1605
3 299 551
4 3813 7052
5 786 1432
6 447 837
7 214 433
8 2078 3604
9 1481 2625
10 97 237
11 232 446
12 1912 3507
13 487 929
14 307 592
15 382 712
16 325 604
17 198 428
18 149 342
19 548 1339
20 1027 2014
21 1097 1806
22 740 1449
23 172 439
24 611 1198
25 78 183
26 291 512
27 319 556
28 133 267
29 1569 2591
30 91 254
31 1579 3331
32 1426 2710
33 50 146
34 1442 2591
35 298 578
36 593 1061
37 1284 2246
38 176 429
39 288 621
40 139 288
41 643 1242
42 5083 8050
43 259 609
44 108 226
45 1685 3591
46 744 1600
47 551 1219
48 80 156
49 544 1021
50 42 103
Final data for plotting:
Job_Category STATE_NAME State_Code Total_Jobs Log_Total_Jobs
0 Alabama AL 658 6.490724
1 Alaska AK 234 5.459586
2 Arizona AZ 1605 7.381502
3 Arkansas AR 551 6.313548
4 California CA 7052 8.861208
5 Colorado CO 1432 7.267525
6 Connecticut CT 837 6.731018
7 Delaware DE 433 6.073045
8 Florida FL 3604 8.190077
9 Georgia GA 2625 7.873217
10 Hawaii HI 237 5.472271
11 Idaho ID 446 6.102559
12 Illinois IL 3507 8.162801
13 Indiana IN 929 6.835185
14 Iowa IA 592 6.385194
15 Kansas KS 712 6.569481
16 Kentucky KY 604 6.405228
17 Louisiana LA 428 6.061457
18 Maine ME 342 5.837730
19 Maryland MD 1339 7.200425
20 Massachusetts MA 2014 7.608374
21 Michigan MI 1806 7.499423
22 Minnesota MN 1449 7.279319
23 Mississippi MS 439 6.086775
24 Missouri MO 1198 7.089243
25 Montana MT 183 5.214936
26 Nebraska NE 512 6.240276
27 Nevada NV 556 6.322565
28 New Hampshire NH 267 5.590987
29 New Jersey NJ 2591 7.860185
30 New Mexico NM 254 5.541264
31 New York NY 3331 8.111328
32 North Carolina NC 2710 7.905073
33 North Dakota ND 146 4.990433
34 Ohio OH 2591 7.860185
35 Oklahoma OK 578 6.361302
36 Oregon OR 1061 6.967909
37 Pennsylvania PA 2246 7.717351
38 Rhode Island RI 429 6.063785
39 South Carolina SC 621 6.432940
40 South Dakota SD 288 5.666427
41 Tennessee TN 1242 7.125283
42 Texas TX 8050 8.993552
43 Utah UT 609 6.413459
44 Vermont VT 226 5.424950
45 Virginia VA 3591 8.186464
46 Washington WA 1600 7.378384
48 West Virginia WV 156 5.056246
49 Wisconsin WI 1021 6.929517
50 Wyoming WY 103 4.644391